***
***Datamaker - de-identifies the data
***

* Move to the folder that holds the raw, identifiable input files. The text after
* cd is a placeholder and must be replaced with a real path before running.
cd  ENTER THE DIRECTORY HOLDING IDENTIFIABLE FILES HERE

***Combine resume data, response, and application data
* Step 1: read the application-level CSV and stage it as temp.dta, sorted by id.
clear
*insheet using "applicant data - compiled.csv"
insheet using "applicant data - compiled - deident.csv"
rename onlineapp online_or_resume
* Every variable from address through wageposted (in dataset order) gets an app_
* prefix, which marks it as coming from the application file.
foreach var of varlist address-wageposted{
	rename `var' app_`var'
}

sort id
save temp, replace

* Step 2: read the employer responses, discard rows with a missing id, and stage
* them as temp1.dta, sorted by id.
clear
insheet using responses.csv
drop if id == .
sort id
save temp1, replace

* Step 3: start from the resume data and attach the staged application data (temp)
* and the staged employer responses (temp1), both keyed on id.
clear
insheet using resume_data.csv
sort id
* NOTE(review): this is the old-style merge syntax (no match type such as 1:1 or
* m:1), so uniqueness of id is not checked by the command itself.
merge id using temp
tab _merge
* Keep only ids present in both the resume data and the application data.
keep if _merge == 3

drop _merge
sort id
merge id using temp1
**should be no 2's
* The expectation above is only inspected through the tabulation below; nothing
* enforces it, and rows of every merge status are retained.
tab _merge
drop _merge

***Application we abandoned
drop if id == 449


***Applicant address census characteristics
* Park the working data in temp.dta while the address lookup table is prepared.
save temp, replace
clear
use address_grid.dta
* Strip commas from the lookup addresses before they are used as the merge key.
replace Address = subinstr(Address, ",", "", .) 

* Keep one row per Address: after sorting by Address and Cord, any row whose
* Address equals the previous row is dropped, leaving the first row of each group.
sort Address Cord
drop if Address == Address[_n-1]
keep Address  censustract
save temp1, replace
clear
use temp
* Build the merge key as street address + " Washington DC " + zip (zip as text).
tostring app_applicantaddresszip, gen(zip)
gen Address = app_applicantaddress + " Washington DC " + zip
sort Address
**should be no 1's
merge m:1 Address using temp1
* Discard lookup addresses that match no applicant row, then remove the helper
* variables used only for this merge.
drop if _merge == 2
drop Address zip _merge

* Attach the ACS variables for the applicant tract; ACS tracts with no applicant
* row are dropped.
rename censustract tract
merge m:1 tract using acs.dta
drop if _merge == 2
drop _merge

* Rebuild jobnum from id: each run of four consecutive ids shares one jobnum.
drop jobnum
sort id
gen jobnum = ceil(id/4)

***Cleaning coordinates to reflect actual values in few cases with input errors
* The most frequent longitude within a job (jobnum) is treated as the correct one;
* presumably all applications to one job should share the same value.
egen avg_app_longit = mode(app_longitude), by(jobnum)
* egen mode() yields missing when a group has no unique mode (a tie) or is all
* missing. Overwrite only when a modal value exists, so that a tied group keeps
* its entered longitudes instead of having them replaced by missing.
replace app_longitude = avg_app_longit if app_longitude ~= avg_app_longit & !missing(avg_app_longit)
* Hard-coded correction for ids 689 through 692.
replace app_longitude = -77.031951 if id <= 692 & id >= 689


***Useful variables
* 0/1 indicators decoded from sex, addresstype and name_type. Each logical
* expression evaluates to 1 when true and 0 otherwise.
gen female = (sex == "female")
* addresstype codes: FA and FP set far; NP and FP set poor.
gen far    = inlist(addresstype, "FA", "FP")
gen poor   = inlist(addresstype, "NP", "FP")
* name_type codes: BF and BM set black; WF and WM set white.
gen black  = inlist(name_type, "BF", "BM")
gen white  = inlist(name_type, "WF", "WM")

***Note: this is distance from the actual employer location (not the incorrect one entered into STATA if this happened)
***to the location of the address actually listed on the resume (not the address that should have been drawn, if there is a difference due to an error)
merge 1:1 id using distance.dta
drop _merge
*geodist app_latitude app_longitude app_latitude1 app_longitude1, gen(distance) mi

* Build a text key from experience_profile and job_title, then expand it into one
* dummy per distinct value (profileXjob_1, profileXjob_2, ...).
tostring experience_profile, gen(blah)
gen profileXjob = blah + " " + job_title
drop blah
* The tabulation stays under capture so a failure does not stop the run here, but
* the failure is now reported: the final keep statement of this file lists
* profileXjob_* and will fail later if the dummies were never created.
capture tab profileXjob, gen(profileXjob_)
if _rc != 0 {
	display as error "Warning: tab profileXjob, gen(profileXjob_) failed with return code " _rc
}

* Interaction of the far and poor indicators.
gen farXpoor = far * poor

* Outcome indicators (0/1) derived from the recorded content of the employer contact.
* response: the contact content is one of the six listed call-back categories.
gen response = inlist(content, ///
	"Asked for Further Information", ///
	"Call Back - Not Specified Why", ///
	"Interview Request", ///
	"Asked to Come in and Fill Out an Application", ///
	"Referred to Other Job and Called Back", ///
	"They Will Call Us Back")
* any_response: some contact content was recorded at all.
gen any_response = (content ~= "")
* interview / rejected: the contact content is exactly that category.
gen interview = (content == "Interview Request")
gen rejected = (content == "Rejected")


* Application date parsed from month-day-year text into a Stata daily date.
gen app_date2 = date(app_date, "MDY")

* Quadrant of the applicant address. First take the first two-letter run made of
* the letters N, S, W, E found in the address text.
gen quadrant = regexs(0) if regexm(app_applicantaddress, "[NSWE][NSWE]")
* Spelled-out quadrant names then override that value, applied in the fixed order
* Northwest, Northeast, Southwest, Southeast (a later match wins).
local quad_words Northwest Northeast Southwest Southeast
local quad_codes NW NE SW SE
forvalues k = 1/4 {
	local word_k : word `k' of `quad_words'
	local code_k : word `k' of `quad_codes'
	replace quadrant = "`code_k'" if regexm(app_applicantaddress, "`word_k'")
}
* Addresses on Georgia Avenue that still have no quadrant are assigned NW.
replace quadrant = "NW" if quadrant == "" & regexm(app_applicantaddress, "Georgia Avenue")


***Fixing some data entry errors
* Set the originally entered value aside under a new name before merging in the
* corrections file.
* NOTE(review): app_online is presumably an abbreviation of app_online_or_resume,
* which would require variable abbreviation to be switched on - confirm.
rename app_online app_online_orig
sort id
merge 1:1 id using fixing_online_vs_email.dta
* Wherever app_online_or_resume is empty after the merge, fall back to the value
* that was set aside above.
replace app_online_or_resume = app_online_orig if app_online_or_resume == ""
drop _merge
drop app_online_orig

***Employer census tract info
* Park the working data (sorted by jobnum) in temp.dta while the employer tract
* file is read.
sort jobnum
save temp, replace
clear
insheet using employer_coord_deident.csv
keep jobnum censustract
rename censustract employerad_tract
sort jobnum
* The employer file must have one row per jobnum; each row matches the several
* applications that share that jobnum.
merge 1:m jobnum using temp
* Merge status is discarded without inspection, so unmatched rows from either
* side stay in the data.
drop _merge
sort employerad_tract
save temp, replace

* ACS tract characteristics, renamed with an employerad_ prefix so that they
* describe the employer tract rather than the applicant tract.
clear
use acs.dta
keep tract median_inc fracbacplus fracwhite
foreach var of varlist tract median_inc fracbacplus fracwhite {
	rename `var' employerad_`var'
}
sort employerad_tract
merge 1:m employerad_tract using temp
* Drop ACS tracts that match no row of the working data.
drop if _merge == 1
drop _merge

***Public transit travel time
* Attach the travel time records by id.
sort id
merge 1:1 id using transittraveltime.dta
drop _merge

* Blank out the two known non-numeric entries, then convert the text variable to
* a numeric one in place.
replace traveltime = "" if inlist(traveltime, "There", "#20004--No")
destring traveltime, replace


***
***Making graphs that require identified data before pulling out exact addresses
***
	*** Figure 5; Making tract maps for applicants
		* Snapshot the full working data in temp.dta; the map work below
		* collapses the data in memory.
		save temp, replace

		* One indicator per address type, then count applicants of each type
		* by tract.
		gen fp = (addresstype == "FP")
		gen fa = (addresstype == "FA")
		gen np = (addresstype == "NP")
		gen na = (1-far) * (1-poor)
		keep tract fp np fa na
		collapse (sum) fp np fa na, by(tract)

		* Text version of the tract number (two decimals, with a trailing 00
		* suffix removed), stored as NAME10 to serve as the shapefile merge key.
		tostring tract,  gen(NAME10) format(%18.2f) force
		replace NAME10 = regexr(NAME10,".00$","")
		drop tract
		save temp1, replace

		* Convert the tract shapefile into Stata datasets: attributes in
		* dctracts.dta and polygon coordinates in dctracts-coord.dta.
		shp2dta using tl_2010_11001_tract10, data(dctracts.dta) coord(dctracts-coord.dta) ///
			genid(tract) gencentroids(centroid) replace

		clear
		use dctracts
		merge 1:1 NAME10 using temp1

		label var fp "Far Poor"
		label var np "Near Poor"
		label var na "Near Affluent"
		label var fa "Far Affluent"

		* For each address type: turn zero counts into missing, draw the map
		* titled with the variable label, and save it as a .gph file.
		foreach grp of varlist fp np fa na {
			replace `grp' = . if `grp' == 0

			spmap `grp' using dctracts-coord.dta, id(tract) ocolor(black) osize(vthin) ///
				clnumber(6) clmethod(custom) clbreaks(1 4 8 16 32 64 999 ) ///
				title(`: var label `grp'') fcolor(Greens)  ///
				legtitle(# Applicants) ndlabel("[0,1)")

			graph save `grp'_map.gph, replace
		}

		* Four-panel figure assembled from the saved maps.
		graph combine np_map.gph na_map.gph fp_map.gph fa_map.gph


	***Figure 6; Making tract map for jobs
		* Count employer rows per tract, keyed on the same NAME10 text form
		* of the tract number (two decimals, trailing 00 suffix removed).
		clear
		insheet using employer_coord_deident.csv
		keep censustract
		tostring censustract,  gen(NAME10) format(%18.2f) force
		replace NAME10 = regexr(NAME10,".00$","")
		drop censustract
		gen num = 1
		collapse (sum) num, by(NAME10)
		save temp1, replace

		* Attach the counts to the tract attribute file dctracts, which the
		* shp2dta step of the Figure 5 section writes.
		clear
		use dctracts
		merge 1:1 NAME10 using temp1

		* Zero counts become missing before the map is drawn.
		replace num = . if num == 0

		spmap num using dctracts-coord.dta, id(tract) ocolor(black) osize(vthin) ///
			clnumber(6) clmethod(custom) clbreaks(1 4 8 16 32 64 999 ) ///
			fcolor(Greens) ndfcolor(white) legtitle(# Employers) ndocolor(black) ///
			ndlabel("[0,1)")

		
*de-identifying
* Reload the snapshot of the full working data that was saved to temp.dta before
* the map sections collapsed the data in memory.
clear
use temp
* Replace the applicant address text with an integer code: encode into code1,
* drop the value label that maps codes back to the text, drop the text variable,
* and give the coded variable the original name.
encode app_applicantaddress, gen(code1)
label drop code1
drop app_applicantaddress
rename code1  app_applicantaddress

* Replace tract with an arbitrary code. Each observation draws a uniform random
* number; the within-tract mean of those draws is constant inside a tract, so
* sorting on it groups each tract together in a random tract order.
* NOTE(review): no seed is set in this file, so the codes depend on the state of
* the random-number generator when this section runs.
gen rand = runiform()
egen mean_rand = mean(rand), by(tract)
sort mean_rand
* Running counter that starts at 0 and increases by 1 each time tract changes
* from the previous row.
gen code1 = 0
replace code1 = code1[_n-1] + (tract ~= tract[_n-1]) in 2/l
drop tract
rename code1  tract

* Explicitly drop identifying and intermediate variables (employer tract, resume
* details, contact details, exact addresses and dates, and helper variables).
drop employerad_tract geoid geoid2 geodisplaylabel high_school date_end1 date_start1 date_end2 date_start2 date_end3 date_start3 v111 job1_end job2_start v114 job3_start job3_end employer_name1 employer_address1 dates_of_employment1 supervisor_phone_number1 employer_name2 employer_address2 dates_of_employment2 supervisor_phone_number2 employer_name3 employer_address3 dates_of_employment3 supervisor_phone_number3 job1skill1 job1skill2 job1skill3 job2skill1 job2skill2 job2skill3 job3skill1 job3skill2 job3skill3 computerskills random_day todays_date today_in_days birthday birth_month year_of_graduation_string hs_grad_day volunteerdescription reference1_employer reference2_employer reference3_employer reference1_phone reference2_phone reference3_phone email job1_start_day job1_end_day job2_start_day job2_end job3_start_day job3_end_day app_phone_number round_gpa v1 worker app_address app_jobcitystate app_jobzip app_commentonexactaddresss app_exactaddress app_date app_started app_ended app_timetaken app_username app_password app_applicantaddresscitystate app_applicantaddresszip app_latitude1 app_longitude1 app_timeposted app_dateposted auto dateofresponse timeofresponse comments avg_app_longit app_date2
drop rand mean_rand app_wageposted name app_applicantname

* Put the main analysis variables first, in a fixed column order.
order jobnum id  addresstype app_addresstype address distance traveltime units tract far poor  farXpoor fracbacplus median_inc fracwhite quadrant app_applicantaddress ///
		  name_type race  sex female black white ///
		app_position app_jobcategory employerad_median_inc employerad_fracbacplus employerad_fracwhite app_ptft app_hours job_title job_title_article industry app_equalopportunityemployer app_criminalhistoryquestions app_askforreferences app_racesexquestion app_questionnaireorskillstest  app_online_or_resume ///
		contacttype contentofcontact response any_response interview rejected

* Whitelist of variables in the released file: the ordered variables above plus
* work_experience, age, work_gap and the profileXjob_ dummies. Anything not
* listed here is removed.
keep jobnum id  addresstype app_addresstype address distance traveltime units tract far poor  farXpoor fracbacplus median_inc fracwhite quadrant app_applicantaddress ///
		  name_type race  sex female black white ///
		app_position app_jobcategory employerad_median_inc employerad_fracbacplus employerad_fracwhite app_ptft app_hours job_title job_title_article industry app_equalopportunityemployer app_criminalhistoryquestions app_askforreferences app_racesexquestion app_questionnaireorskillstest  app_online_or_resume ///
		contacttype contentofcontact response any_response interview rejected ///
		work_experience age work_gap  profileXjob_*

		
		


* Switch to the directory named by the global macro maindir and write the
* de-identified dataset there.
* NOTE(review): maindir is not defined anywhere in the visible part of this file;
* presumably a calling do-file sets it - confirm, since an empty value would not
* change to the intended output folder.
cd "$maindir" 
save phillips_jhr_cleandata, replace




